# Import necessary libraries
import pandas as pd
# Load the dataset of aircraft engine sensor readings.
# Each row holds one timestamped reading per engine plus a binary
# Maintenance_Needed label and a Remaining_used_life value (see data.head()).
data = pd.read_csv("aircraft_engine_maintenance_supervised_learning.csv")
# Preview the first five rows.
data.head()
| Unnamed: 0 | Engine_ID | Timestamp | Temperature | Pressure | Rotational_Speed | Engine_Health | Fuel_Consumption | Vibration_Level | Oil_Temperature | Altitude | Humidity | Maintenance_Needed | Remaining_used_life | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1 | 2023-01-01 0:00:00 | 529.106110 | 284.745866 | 3068.290695 | 1.764369 | 49.086087 | 9.164463 | 61.971112 | 1084.991320 | 55.206168 | 1 | 27.3 |
| 1 | 1 | 1 | 2023-01-01 0:10:00 | 499.696207 | 270.464475 | 2693.907436 | 1.265819 | 47.371939 | 7.289872 | 79.359045 | 1016.557471 | 64.382801 | 1 | 21.7 |
| 2 | 2 | 1 | 2023-01-01 0:20:00 | 480.154652 | 258.314496 | 3139.764855 | 1.311214 | 40.736703 | 7.144859 | 88.969967 | 902.039503 | 58.907223 | 1 | 22.3 |
| 3 | 3 | 1 | 2023-01-01 0:30:00 | 531.114934 | 281.212266 | 3043.335177 | 1.128121 | 56.214109 | 6.547841 | 73.004457 | 408.693047 | 50.655640 | 0 | 20.9 |
| 4 | 4 | 1 | 2023-01-01 0:40:00 | 499.197208 | 283.305874 | 3134.974852 | 1.567308 | 47.607585 | 8.252557 | 97.416898 | 1217.009675 | 48.393398 | 1 | 25.0 |
# Preview the last five rows of the dataset.
data.tail()
| Unnamed: 0 | Engine_ID | Timestamp | Temperature | Pressure | Rotational_Speed | Engine_Health | Fuel_Consumption | Vibration_Level | Oil_Temperature | Altitude | Humidity | Maintenance_Needed | Remaining_used_life | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 715 | 715 | 5 | 2023-01-01 23:10:00 | 414.677212 | 224.616493 | 2739.934722 | 1.251797 | 44.823469 | 6.795081 | 66.359294 | 1012.825820 | 47.324979 | 0 | 20.30 |
| 716 | 716 | 5 | 2023-01-01 23:20:00 | 534.222365 | 284.246641 | 2963.896803 | 1.266641 | 47.426626 | 7.050664 | 79.428251 | 998.980476 | 32.267225 | 1 | 22.00 |
| 717 | 717 | 5 | 2023-01-01 23:30:00 | 488.910939 | 262.138912 | 3209.008581 | 1.783258 | 44.454666 | 9.373671 | 97.021645 | 1251.538058 | 40.017720 | 0 | 27.60 |
| 718 | 718 | 5 | 2023-01-01 23:40:00 | NaN | 271.861935 | 2915.592050 | 1.619702 | NaN | 8.440549 | 78.458211 | 1003.570383 | 45.091718 | 1 | 22.65 |
| 719 | 719 | 5 | 2023-01-01 23:50:00 | 415.567779 | 222.505995 | 3174.985740 | 2.075334 | 42.598441 | 10.429919 | 86.549065 | 1254.175362 | 46.596882 | 1 | 29.40 |
# Column dtypes and non-null counts; this reveals the missing values in
# Temperature (686/720), Pressure (681/720) and Fuel_Consumption (650/720).
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 720 entries, 0 to 719 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 720 non-null int64 1 Engine_ID 720 non-null int64 2 Timestamp 720 non-null object 3 Temperature 686 non-null float64 4 Pressure 681 non-null float64 5 Rotational_Speed 720 non-null float64 6 Engine_Health 720 non-null float64 7 Fuel_Consumption 650 non-null float64 8 Vibration_Level 720 non-null float64 9 Oil_Temperature 720 non-null float64 10 Altitude 720 non-null float64 11 Humidity 720 non-null float64 12 Maintenance_Needed 720 non-null int64 13 Remaining_used_life 720 non-null float64 dtypes: float64(10), int64(3), object(1) memory usage: 78.9+ KB
# Summary statistics (count/mean/std/quartiles) for the numerical columns.
data.describe()
| Unnamed: 0 | Engine_ID | Temperature | Pressure | Rotational_Speed | Engine_Health | Fuel_Consumption | Vibration_Level | Oil_Temperature | Altitude | Humidity | Maintenance_Needed | Remaining_used_life | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 720.000000 | 720.000000 | 686.000000 | 681.000000 | 720.000000 | 720.000000 | 650.000000 | 720.000000 | 720.000000 | 720.000000 | 720.000000 | 720.000000 | 720.000000 |
| mean | 359.500000 | 3.000000 | 500.071917 | 269.592322 | 2989.594389 | 1.356543 | 46.492836 | 7.419655 | 80.488982 | 1001.442687 | 49.691074 | 0.644444 | 22.720556 |
| std | 207.990384 | 1.415197 | 50.110265 | 25.519526 | 204.452023 | 0.336493 | 4.485993 | 1.353611 | 9.961242 | 211.774371 | 10.333495 | 0.479014 | 3.448688 |
| min | 0.000000 | 1.000000 | 351.604344 | 194.364483 | 2398.058779 | 0.384061 | 32.476898 | 3.486958 | 51.502253 | 372.016586 | 12.655992 | 0.000000 | 13.300000 |
| 25% | 179.750000 | 2.000000 | 467.591610 | 253.859578 | 2841.774479 | 1.131308 | 43.691781 | 6.517824 | 73.958559 | 852.427213 | 42.675328 | 0.000000 | 20.500000 |
| 50% | 359.500000 | 3.000000 | 499.766139 | 270.002663 | 2993.425924 | 1.348170 | 46.384867 | 7.381727 | 80.355930 | 1012.950873 | 49.692443 | 1.000000 | 22.650000 |
| 75% | 539.250000 | 4.000000 | 533.696565 | 286.072363 | 3134.710998 | 1.574788 | 49.386030 | 8.312738 | 87.391627 | 1134.953476 | 56.663203 | 1.000000 | 25.000000 |
| max | 719.000000 | 5.000000 | 669.467631 | 357.208273 | 3559.917040 | 2.603570 | 67.240246 | 12.469578 | 109.728101 | 1741.281836 | 89.415037 | 1.000000 | 35.600000 |
# Check missing values using msno
import missingno as msno
# Bar chart of non-null counts per column; shorter bars mark columns with NaNs.
msno.bar(data, color='grey')
<Axes: >
# Display columns with missing values
import seaborn as sns
# Heatmap of the boolean null-mask: highlighted cells are missing entries.
sns.heatmap(data.isnull())
<Axes: >
# Impute missing values (replace with the per-column median) on numerical columns.
# NOTE(review): per data.info() only Temperature, Pressure and Fuel_Consumption
# actually contain NaNs; filling the remaining listed columns is a no-op.
# Engine_ID and Maintenance_Needed are an identifier and a label rather than
# sensor measurements, but including them here does not change the result.
numerical_columns = ["Engine_ID", "Temperature", "Pressure", "Rotational_Speed", "Engine_Health",
                     'Fuel_Consumption', 'Vibration_Level', 'Oil_Temperature', 'Altitude',
                     'Humidity', 'Maintenance_Needed']
# Assign back explicitly instead of DataFrame.fillna(..., inplace=True):
# the targeted assignment touches only the listed columns and avoids the
# pandas inplace/chained-assignment deprecation path.
data[numerical_columns] = data[numerical_columns].fillna(data[numerical_columns].median())
numerical_columns
['Engine_ID', 'Temperature', 'Pressure', 'Rotational_Speed', 'Engine_Health', 'Fuel_Consumption', 'Vibration_Level', 'Oil_Temperature', 'Altitude', 'Humidity', 'Maintenance_Needed']
# Confirm no missing values remain after the median imputation.
data.isnull().sum()
Unnamed: 0 0 Engine_ID 0 Timestamp 0 Temperature 0 Pressure 0 Rotational_Speed 0 Engine_Health 0 Fuel_Consumption 0 Vibration_Level 0 Oil_Temperature 0 Altitude 0 Humidity 0 Maintenance_Needed 0 Remaining_used_life 0 dtype: int64
# Check missing values using msno after filling with median value
import missingno as msno
# All bars should now reach the full 720-row height.
msno.bar(data, color='grey')
<Axes: >
# plotting boxplot to visualise outliers
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE(review): this suppresses *all* warnings globally for the rest of the
# session; consider scoping it with warnings.catch_warnings().
warnings.filterwarnings("ignore")

# Univariate analysis (numerical columns): one boxplot per sensor feature.
columns_to_check = ['Temperature', 'Pressure', 'Rotational_Speed', 'Engine_Health', 'Fuel_Consumption',
                    'Vibration_Level', 'Oil_Temperature', 'Altitude', 'Humidity']
plt.figure(figsize=(18, 10))
for i, cols in enumerate(columns_to_check, 1):
    plt.subplot(2, 5, i)  # a 2x5 grid comfortably holds the 9 plots
    sns.boxplot(data[cols])
    plt.xlabel(cols)
    # FIX: a boxplot's y-axis shows the measured values, not a frequency
    # count, so "Frequency" was a mislabel.
    plt.ylabel("Value")
plt.tight_layout()
plt.show()
# Univariate analysis (Target column i.e. Maintenance_Needed) - Countplot
# Shows the class balance; per describe(), about 64% of rows are labelled 1.
plt.figure(figsize=(6,4))
sns.countplot(data=data, x="Maintenance_Needed")
<Axes: xlabel='Maintenance_Needed', ylabel='count'>
# Bivariate analysis - Pairplot
# Pairwise scatter plots of all sensor features plus the regression target,
# coloured by the maintenance label.
sns.pairplot(data[columns_to_check + ["Maintenance_Needed", "Remaining_used_life"]], hue="Maintenance_Needed")
plt.show()
# Multivariate analysis - Correlation Heatmap
# Pearson correlations; note Engine_Health and Vibration_Level are almost
# perfectly correlated (~0.997) and both track Remaining_used_life (~0.97).
correlation_matrix = data[columns_to_check + ["Maintenance_Needed", "Remaining_used_life"]].corr()
correlation_matrix
| Temperature | Pressure | Rotational_Speed | Engine_Health | Fuel_Consumption | Vibration_Level | Oil_Temperature | Altitude | Humidity | Maintenance_Needed | Remaining_used_life | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Temperature | 1.000000 | 0.940519 | 0.046193 | 0.367859 | 0.699923 | 0.365465 | 0.016888 | -0.044562 | 0.007619 | 0.150548 | 0.446575 |
| Pressure | 0.940519 | 1.000000 | 0.037158 | 0.363584 | 0.720393 | 0.360060 | 0.004292 | -0.064315 | -0.005532 | 0.135145 | 0.426118 |
| Rotational_Speed | 0.046193 | 0.037158 | 1.000000 | 0.154165 | 0.063515 | 0.156965 | -0.089808 | -0.033462 | -0.073330 | 0.176332 | 0.273728 |
| Engine_Health | 0.367859 | 0.363584 | 0.154165 | 1.000000 | 0.245414 | 0.997212 | -0.015014 | 0.028417 | -0.047519 | 0.349076 | 0.971913 |
| Fuel_Consumption | 0.699923 | 0.720393 | 0.063515 | 0.245414 | 1.000000 | 0.246541 | 0.039242 | -0.116406 | -0.008178 | 0.097970 | 0.306648 |
| Vibration_Level | 0.365465 | 0.360060 | 0.156965 | 0.997212 | 0.246541 | 1.000000 | -0.012863 | 0.029495 | -0.046418 | 0.348684 | 0.973311 |
| Oil_Temperature | 0.016888 | 0.004292 | -0.089808 | -0.015014 | 0.039242 | -0.012863 | 1.000000 | 0.050233 | 0.036015 | 0.140488 | -0.013443 |
| Altitude | -0.044562 | -0.064315 | -0.033462 | 0.028417 | -0.116406 | 0.029495 | 0.050233 | 1.000000 | 0.045176 | 0.038855 | 0.025921 |
| Humidity | 0.007619 | -0.005532 | -0.073330 | -0.047519 | -0.008178 | -0.046418 | 0.036015 | 0.045176 | 1.000000 | -0.099261 | -0.051098 |
| Maintenance_Needed | 0.150548 | 0.135145 | 0.176332 | 0.349076 | 0.097970 | 0.348684 | 0.140488 | 0.038855 | -0.099261 | 1.000000 | 0.361361 |
| Remaining_used_life | 0.446575 | 0.426118 | 0.273728 | 0.971913 | 0.306648 | 0.973311 | -0.013443 | 0.025921 | -0.051098 | 0.361361 | 1.000000 |
# Correlation heatmap showing the level of correlation between the features.
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Correlation Heatmap")
Text(0.5, 1.0, 'Correlation Heatmap')
# Create a time-related feature (Hour_of_Day) from the raw Timestamp string.
dataCopy = data.copy()
# Parse timestamps; unparseable values become NaT rather than raising.
parsed_ts = pd.to_datetime(dataCopy['Timestamp'], errors='coerce')
dataCopy['Timestamp'] = parsed_ts
# Discard any rows whose timestamp failed to parse.
dataCopy = dataCopy.dropna(subset=['Timestamp'])
dataCopy['Hour_of_Day'] = dataCopy['Timestamp'].dt.hour
dataCopy.head()
| Unnamed: 0 | Engine_ID | Timestamp | Temperature | Pressure | Rotational_Speed | Engine_Health | Fuel_Consumption | Vibration_Level | Oil_Temperature | Altitude | Humidity | Maintenance_Needed | Remaining_used_life | Hour_of_Day | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1 | 2023-01-01 00:00:00 | 529.106110 | 284.745866 | 3068.290695 | 1.764369 | 49.086087 | 9.164463 | 61.971112 | 1084.991320 | 55.206168 | 1 | 27.3 | 0 |
| 1 | 1 | 1 | 2023-01-01 00:10:00 | 499.696207 | 270.464475 | 2693.907436 | 1.265819 | 47.371939 | 7.289872 | 79.359045 | 1016.557471 | 64.382801 | 1 | 21.7 | 0 |
| 2 | 2 | 1 | 2023-01-01 00:20:00 | 480.154652 | 258.314496 | 3139.764855 | 1.311214 | 40.736703 | 7.144859 | 88.969967 | 902.039503 | 58.907223 | 1 | 22.3 | 0 |
| 3 | 3 | 1 | 2023-01-01 00:30:00 | 531.114934 | 281.212266 | 3043.335177 | 1.128121 | 56.214109 | 6.547841 | 73.004457 | 408.693047 | 50.655640 | 0 | 20.9 | 0 |
| 4 | 4 | 1 | 2023-01-01 00:40:00 | 499.197208 | 283.305874 | 3134.974852 | 1.567308 | 47.607585 | 8.252557 | 97.416898 | 1217.009675 | 48.393398 | 1 | 25.0 | 0 |
# Binning 'Hour_of_Day' into four parts of the day.
# With right=False the intervals are left-closed: [0, 6) -> Night,
# [6, 12) -> Morning, [12, 18) -> Afternoon, [18, 24) -> Evening, so hour 0
# is included without needing the original -1 lower edge (a right=True
# artifact). FIX: the bin-label list is also renamed so it no longer clobbers
# the module-level name `labels`, which a later cell reuses for the target.
hour_bins = [0, 6, 12, 18, 24]
day_part_labels = ['Night', 'Morning', 'Afternoon', 'Evening']
dataCopy['Day_Part'] = pd.cut(dataCopy['Hour_of_Day'], bins=hour_bins, labels=day_part_labels, right=False)
# Display the dataset with the new feature
dataCopy.head()
| Unnamed: 0 | Engine_ID | Timestamp | Temperature | Pressure | Rotational_Speed | Engine_Health | Fuel_Consumption | Vibration_Level | Oil_Temperature | Altitude | Humidity | Maintenance_Needed | Remaining_used_life | Hour_of_Day | Day_Part | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1 | 2023-01-01 00:00:00 | 529.106110 | 284.745866 | 3068.290695 | 1.764369 | 49.086087 | 9.164463 | 61.971112 | 1084.991320 | 55.206168 | 1 | 27.3 | 0 | Night |
| 1 | 1 | 1 | 2023-01-01 00:10:00 | 499.696207 | 270.464475 | 2693.907436 | 1.265819 | 47.371939 | 7.289872 | 79.359045 | 1016.557471 | 64.382801 | 1 | 21.7 | 0 | Night |
| 2 | 2 | 1 | 2023-01-01 00:20:00 | 480.154652 | 258.314496 | 3139.764855 | 1.311214 | 40.736703 | 7.144859 | 88.969967 | 902.039503 | 58.907223 | 1 | 22.3 | 0 | Night |
| 3 | 3 | 1 | 2023-01-01 00:30:00 | 531.114934 | 281.212266 | 3043.335177 | 1.128121 | 56.214109 | 6.547841 | 73.004457 | 408.693047 | 50.655640 | 0 | 20.9 | 0 | Night |
| 4 | 4 | 1 | 2023-01-01 00:40:00 | 499.197208 | 283.305874 | 3134.974852 | 1.567308 | 47.607585 | 8.252557 | 97.416898 | 1217.009675 | 48.393398 | 1 | 25.0 | 0 | Night |
# Feature selections
# NOTE(review): features are taken from `data`, not `dataCopy`, so the
# engineered Hour_of_Day / Day_Part columns are never fed to the models
# below -- confirm this is intentional.
labels = data[['Maintenance_Needed']]
features = data.drop(['Maintenance_Needed','Unnamed: 0','Engine_ID','Timestamp', 'Remaining_used_life'], axis=1)
features.head()
| Temperature | Pressure | Rotational_Speed | Engine_Health | Fuel_Consumption | Vibration_Level | Oil_Temperature | Altitude | Humidity | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 529.106110 | 284.745866 | 3068.290695 | 1.764369 | 49.086087 | 9.164463 | 61.971112 | 1084.991320 | 55.206168 |
| 1 | 499.696207 | 270.464475 | 2693.907436 | 1.265819 | 47.371939 | 7.289872 | 79.359045 | 1016.557471 | 64.382801 |
| 2 | 480.154652 | 258.314496 | 3139.764855 | 1.311214 | 40.736703 | 7.144859 | 88.969967 | 902.039503 | 58.907223 |
| 3 | 531.114934 | 281.212266 | 3043.335177 | 1.128121 | 56.214109 | 6.547841 | 73.004457 | 408.693047 | 50.655640 |
| 4 | 499.197208 | 283.305874 | 3134.974852 | 1.567308 | 47.607585 | 8.252557 | 97.416898 | 1217.009675 | 48.393398 |
# Compare the label frame's 2-D shape with its squeezed 1-D equivalent.
for shape in (labels.shape, labels.squeeze().shape):
    print(shape)
(720, 1) (720,)
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets (80/20, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(features, labels.squeeze(), test_size=0.2, random_state=0)
# Standardize our training data; the scaler is fit on the training split only
# so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.select_dtypes(include=['number']))
X_test_scaled = scaler.transform(X_test.select_dtypes(include=['number']))
X_train_scaled.shape
(576, 9)
# Shape of the scaled test split (144 rows x 9 features).
X_test_scaled.shape
(144, 9)
# Logistic Regression
from sklearn.linear_model import LogisticRegression
# Initialize the model (fixed seed for reproducibility)
log_reg_model = LogisticRegression(random_state=0)
# Train the model on the standardized training data
log_reg_model.fit(X_train_scaled, y_train)
LogisticRegression(random_state=0)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LogisticRegression(random_state=0)
# Make predictions on the test set
y_pred = log_reg_model.predict(X_test_scaled)
# Display the predicted binary Maintenance_Needed labels.
y_pred
array([1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1,
1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0,
1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1,
0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0], dtype=int64)
# Evaluate the model
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
# Display results (accuracy as a percentage)
print("Accuracy: ", accuracy*100)
print("Classification Report: \n", report)
print("Confusion Matrix: \n", matrix)
Accuracy: 72.91666666666666
Classification Report:
precision recall f1-score support
0 0.58 0.47 0.52 45
1 0.78 0.85 0.81 99
accuracy 0.73 144
macro avg 0.68 0.66 0.67 144
weighted avg 0.72 0.73 0.72 144
Confusion Matrix:
[[21 24]
[15 84]]
# Visualise the logistic-regression confusion matrix as an annotated heatmap.
conf_mat = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(conf_mat, annot=True, fmt='d', cmap='flare')
plt.xlabel("Predicted Class", fontsize=12)
plt.ylabel("True Class", fontsize=12)
plt.title("Confusion Matrix", fontsize=12)
plt.show()
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Candidate classifiers to compare, all seeded for reproducibility.
models = {
    "Logistic Regression": LogisticRegression(random_state=0),
    "Decision Tree": DecisionTreeClassifier(random_state=0),
    "Random Forest": RandomForestClassifier(random_state=0),
    "SVC": SVC(random_state=0),
}

# Sanity-check the registry contents.
for name, estimator in models.items():
    print(name, "=", estimator)
Logistic Regression = LogisticRegression(random_state=0) Decision Tree = DecisionTreeClassifier(random_state=0) Random Forest = RandomForestClassifier(random_state=0) SVC = SVC(random_state=0)
# Train and evaluate every candidate model on the full 9-feature set.
# NOTE(review): the loop variables (model_name, y_pred, accuracy, ...) leak
# into module scope; a later cell prints `model_name` and depends on this.
for model_name, model in models.items():
    # Training and prediction
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    matrix = confusion_matrix(y_test, y_pred)
    # Display results
    print(model_name)
    print("Accuracy: ", accuracy*100)
    print("Classification Report: \n", report)
    plt.figure(figsize=(5,3))
    #sns.heatmap(matrix, annot=True)
    sns.heatmap(matrix, cmap='flare',annot=True, fmt='d')
    plt.xlabel("Predicted Class",fontsize=10)
    plt.ylabel("True Class",fontsize=10)
    plt.title("Confusion Matrix",fontsize=10)
    plt.show()
    print("\n")
Logistic Regression
Accuracy: 72.91666666666666
Classification Report:
precision recall f1-score support
0 0.58 0.47 0.52 45
1 0.78 0.85 0.81 99
accuracy 0.73 144
macro avg 0.68 0.66 0.67 144
weighted avg 0.72 0.73 0.72 144
Decision Tree
Accuracy: 74.30555555555556
Classification Report:
precision recall f1-score support
0 0.58 0.62 0.60 45
1 0.82 0.80 0.81 99
accuracy 0.74 144
macro avg 0.70 0.71 0.71 144
weighted avg 0.75 0.74 0.75 144
Random Forest
Accuracy: 82.63888888888889
Classification Report:
precision recall f1-score support
0 0.76 0.64 0.70 45
1 0.85 0.91 0.88 99
accuracy 0.83 144
macro avg 0.81 0.78 0.79 144
weighted avg 0.82 0.83 0.82 144
SVC
Accuracy: 74.30555555555556
Classification Report:
precision recall f1-score support
0 0.62 0.47 0.53 45
1 0.78 0.87 0.82 99
accuracy 0.74 144
macro avg 0.70 0.67 0.68 144
weighted avg 0.73 0.74 0.73 144
import matplotlib.pyplot as plt

# Fit a fresh Random Forest and rank the features by impurity-based importance.
forest = RandomForestClassifier(random_state=0)
forest.fit(X_train_scaled, y_train)

# Pair each feature name with its importance score and sort descending.
importance_df = pd.DataFrame(
    {'Features': X_train.columns, 'Importance': forest.feature_importances_}
)
importance_df.sort_values(by='Importance', ascending=False, inplace=True)

# Horizontal bar chart, most important feature on top.
sns.barplot(x='Importance', y='Features', data=importance_df, palette='viridis')
<Axes: xlabel='Importance', ylabel='Features'>
# The four highest-ranked features from the Random Forest importance plot.
selected_features = ['Engine_Health', 'Vibration_Level', 'Rotational_Speed', 'Oil_Temperature']
# Split the data into training and testing sets (same 80/20 split and seed
# as before, so results are comparable).
X_train_, X_test_, y_train_, y_test_ = train_test_split(data[selected_features], labels.squeeze(), test_size=0.2,
                                                        random_state=0)
# Standardize our training data (scaler re-fit on the reduced feature set)
scaler = StandardScaler()
X_train_scaled_ = scaler.fit_transform(X_train_.select_dtypes(include=['number']))
X_test_scaled_ = scaler.transform(X_test_.select_dtypes(include=['number']))
# Train, Test, and Evaluate Model
# Re-run the model comparison on the reduced 4-feature set.
# NOTE(review): after this loop `model_name` is left set to "SVC"; the
# tuned-model cell below prints it.
for model_name, model in models.items():
    # Training and prediction
    model.fit(X_train_scaled_, y_train_)
    y_pred = model.predict(X_test_scaled_)
    # Evaluate the model
    accuracy = accuracy_score(y_test_, y_pred)
    report = classification_report(y_test_, y_pred)
    matrix = confusion_matrix(y_test_, y_pred)
    # Display results
    print(model_name)
    print("Accuracy: ", accuracy*100)
    print("Classification Report: \n", report)
    plt.figure(figsize=(4,2))
    sns.heatmap(matrix, cmap='flare',annot=True, fmt='d')
    plt.xlabel("Predicted Class",fontsize=10)
    plt.ylabel("True Class",fontsize=10)
    plt.title("Confusion Matrix",fontsize=10)
    plt.show()
    print("\n")
Logistic Regression
Accuracy: 71.52777777777779
Classification Report:
precision recall f1-score support
0 0.56 0.40 0.47 45
1 0.76 0.86 0.81 99
accuracy 0.72 144
macro avg 0.66 0.63 0.64 144
weighted avg 0.70 0.72 0.70 144
Decision Tree
Accuracy: 74.30555555555556
Classification Report:
precision recall f1-score support
0 0.57 0.69 0.63 45
1 0.84 0.77 0.80 99
accuracy 0.74 144
macro avg 0.71 0.73 0.72 144
weighted avg 0.76 0.74 0.75 144
Random Forest
Accuracy: 80.55555555555556
Classification Report:
precision recall f1-score support
0 0.70 0.67 0.68 45
1 0.85 0.87 0.86 99
accuracy 0.81 144
macro avg 0.77 0.77 0.77 144
weighted avg 0.80 0.81 0.80 144
SVC
Accuracy: 77.77777777777779
Classification Report:
precision recall f1-score support
0 0.68 0.56 0.61 45
1 0.81 0.88 0.84 99
accuracy 0.78 144
macro avg 0.74 0.72 0.73 144
weighted avg 0.77 0.78 0.77 144
from sklearn.model_selection import GridSearchCV

# Hyperparameter search space for the Random Forest.
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Exhaustive search with 5-fold cross-validation, scored on accuracy.
base_forest = RandomForestClassifier(random_state=0)
grid_search = GridSearchCV(estimator=base_forest, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_scaled_, y_train_)

# Best hyperparameter combination found on the training folds.
best_param = grid_search.best_params_
best_param
{'max_depth': None,
'min_samples_leaf': 4,
'min_samples_split': 10,
'n_estimators': 150}
# Train the final Random Forest with the best hyperparameters from the grid search.
best_radom_forest_model = RandomForestClassifier(random_state=0, **best_param)
best_radom_forest_model.fit(X_train_scaled_, y_train_)
# Make predictions on the test set
y_pred = best_radom_forest_model.predict(X_test_scaled_)
# Evaluate the model
accuracy = accuracy_score(y_test_, y_pred)
report = classification_report(y_test_, y_pred)
matrix = confusion_matrix(y_test_, y_pred)
# Display results
# BUG FIX: the original printed `model_name`, a leftover loop variable from
# the model-comparison cell, so this tuned Random Forest was mislabelled
# as "SVC" in the output.
print("Tuned Random Forest")
print("Accuracy: ", accuracy*100)
print("Classification Report: \n", report)
plt.figure(figsize=(4,2))
sns.heatmap(matrix, cmap='flare',annot=True, fmt='d')
plt.xlabel("Predicted Class",fontsize=10)
plt.ylabel("True Class",fontsize=10)
plt.title("Confusion Matrix",fontsize=10)
plt.show()
print("\n")
SVC
Accuracy: 84.02777777777779
Classification Report:
precision recall f1-score support
0 0.79 0.67 0.72 45
1 0.86 0.92 0.89 99
accuracy 0.84 144
macro avg 0.82 0.79 0.81 144
weighted avg 0.84 0.84 0.84 144
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Regression task: predict Remaining_used_life from the sensor features.
lin_reg_model = LinearRegression()
labels = data[['Remaining_used_life']]
features = data.drop(['Maintenance_Needed','Unnamed: 0','Engine_ID','Timestamp', 'Remaining_used_life'], axis=1)
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, labels.squeeze(), test_size=0.2, random_state=0)
# Standardize our training data (fit on the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.select_dtypes(include=['number']))
X_test_scaled = scaler.transform(X_test.select_dtypes(include=['number']))
# Training
lin_reg_model.fit(X_train_scaled, y_train)
# Prediction on the training and testing set
y_train_pred = lin_reg_model.predict(X_train_scaled)
y_test_pred = lin_reg_model.predict(X_test_scaled)
# Evaluate the model.
# FIX: mean_squared_error(..., squared=False) was deprecated in scikit-learn
# 1.4 and removed in 1.6; take the square root explicitly so this cell runs
# on any scikit-learn version.
train_rmse = mean_squared_error(y_train, y_train_pred) ** 0.5
test_rmse = mean_squared_error(y_test, y_test_pred) ** 0.5
train_r2_score = r2_score(y_train, y_train_pred)
test_r2_score = r2_score(y_test, y_test_pred)
print("Train RMSE: ", train_rmse)
print("Test RMSE: ", test_rmse)
print("Train R^2: ", train_r2_score)
print("Test R^2: ", test_r2_score)
Train RMSE: 0.47154438446230085 Test RMSE: 0.8599072583216776 Train R^2: 0.9819079382535665 Test R^2: 0.9276613414587991